{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "43d2ea6b-6ec7-402f-a091-b7e97013c407",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:50.420858Z",
     "iopub.status.busy": "2024-07-13T07:17:50.420659Z",
     "iopub.status.idle": "2024-07-13T07:17:50.657039Z",
     "shell.execute_reply": "2024-07-13T07:17:50.656574Z",
     "shell.execute_reply.started": "2024-07-13T07:17:50.420838Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import pickle\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6c9f4122-aa57-49b3-bd54-dc4c5c7b98aa",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:51.168108Z",
     "iopub.status.busy": "2024-07-13T07:17:51.167858Z",
     "iopub.status.idle": "2024-07-13T07:17:51.170853Z",
     "shell.execute_reply": "2024-07-13T07:17:51.170534Z",
     "shell.execute_reply.started": "2024-07-13T07:17:51.168094Z"
    }
   },
   "outputs": [],
   "source": [
    "dt = '20240713'\n",
    "version = 'v1'\n",
    "\n",
    "output_dir = os.path.join('outputs', f'{version}_{dt}')\n",
    "os.makedirs(output_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87992b21-3165-4175-9fd7-eca3cef26fe1",
   "metadata": {},
   "source": [
    "# 加载文档片段"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f059b69b-b8e0-4370-b026-c42c3351f203",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:52.407304Z",
     "iopub.status.busy": "2024-07-13T07:17:52.407110Z",
     "iopub.status.idle": "2024-07-13T07:17:53.835246Z",
     "shell.execute_reply": "2024-07-13T07:17:53.834773Z",
     "shell.execute_reply.started": "2024-07-13T07:17:52.407291Z"
    }
   },
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "loader = PyPDFLoader(\"data/2024全球经济金融展望报告.pdf\")\n",
    "documents = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ff9e807a-669c-4023-8f6a-080f68ffb4a7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:53.836130Z",
     "iopub.status.busy": "2024-07-13T07:17:53.835996Z",
     "iopub.status.idle": "2024-07-13T07:17:53.843534Z",
     "shell.execute_reply": "2024-07-13T07:17:53.843091Z",
     "shell.execute_reply.started": "2024-07-13T07:17:53.836117Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "found cache, restoring...\n"
     ]
    }
   ],
   "source": [
    "from uuid import uuid4\n",
    "import os\n",
    "import pickle\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "def split_docs(documents, filepath, chunk_size=400, chunk_overlap=40, seperators=['\\n\\n\\n', '\\n\\n'], force_split=False):\n",
    "    \"\"\"\n",
    "    Split documents into chunks, tag each new chunk with a UUID, and cache the result on disk.\n",
    "\n",
    "    When `filepath` already exists and `force_split` is False, the pickled object is returned\n",
    "    as-is: `documents` and all splitter settings are ignored in that case, so pass\n",
    "    `force_split=True` after changing `chunk_size`, `chunk_overlap` or `seperators`.\n",
    "\n",
    "    :param documents: documents handed to `RecursiveCharacterTextSplitter.split_documents`\n",
    "    :param filepath: path of the pickle cache file\n",
    "    :param chunk_size: `chunk_size` passed to the splitter\n",
    "    :param chunk_overlap: `chunk_overlap` passed to the splitter\n",
    "    :param seperators: `separators` passed to the splitter (the list is not mutated here)\n",
    "    :param force_split: when True, ignore an existing cache file and overwrite it\n",
    "    :return: the cached object on a cache hit, otherwise the freshly split chunks,\n",
    "        each with a `uuid` entry added to its metadata\n",
    "    \"\"\"\n",
    "    if os.path.exists(filepath) and not force_split:\n",
    "        print('found cache, restoring...')\n",
    "        # NOTE: unpickling can execute arbitrary code; only restore cache files this notebook wrote.\n",
    "        with open(filepath, 'rb') as cache_file:\n",
    "            return pickle.load(cache_file)\n",
    "\n",
    "    splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=chunk_size,\n",
    "        chunk_overlap=chunk_overlap,\n",
    "        separators=seperators\n",
    "    )\n",
    "    chunks = splitter.split_documents(documents)\n",
    "    for chunk in chunks:\n",
    "        # Random identifier: it only stays stable for as long as the cache file is reused.\n",
    "        chunk.metadata['uuid'] = str(uuid4())\n",
    "\n",
    "    # The context manager closes the cache file deterministically, even if pickling raises.\n",
    "    with open(filepath, 'wb') as cache_file:\n",
    "        pickle.dump(chunks, cache_file)\n",
    "\n",
    "    return chunks\n",
    "\n",
    "splitted_docs = split_docs(documents, os.path.join(output_dir, 'split_docs.pkl'), chunk_size=500, chunk_overlap=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "85aa6ae7-ae2b-48f0-9005-a89c442da798",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:54.928228Z",
     "iopub.status.busy": "2024-07-13T07:17:54.928027Z",
     "iopub.status.idle": "2024-07-13T07:17:54.930652Z",
     "shell.execute_reply": "2024-07-13T07:17:54.930315Z",
     "shell.execute_reply.started": "2024-07-13T07:17:54.928215Z"
    }
   },
   "outputs": [],
   "source": [
    "uuid2doc = {doc.metadata['uuid']: doc for doc in splitted_docs}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "af6a39a1-3f66-40ee-8606-d5fff4b6a78d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:55.264608Z",
     "iopub.status.busy": "2024-07-13T07:17:55.264407Z",
     "iopub.status.idle": "2024-07-13T07:17:55.269073Z",
     "shell.execute_reply": "2024-07-13T07:17:55.268602Z",
     "shell.execute_reply.started": "2024-07-13T07:17:55.264595Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "52"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uuid2doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28404415-6040-4865-a5e5-4d06e7a41161",
   "metadata": {},
   "source": [
    "# 加载抽取的QA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5f3c2843-b4d3-498e-ac31-77aa53300a13",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:56.233162Z",
     "iopub.status.busy": "2024-07-13T07:17:56.232980Z",
     "iopub.status.idle": "2024-07-13T07:17:56.343410Z",
     "shell.execute_reply": "2024-07-13T07:17:56.342878Z",
     "shell.execute_reply.started": "2024-07-13T07:17:56.233149Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Load the extracted QA pairs and keep only the rows whose `dataset` column is 'train'.\n",
    "qa_df = pd.read_excel(os.path.join(output_dir, 'question_answer.xlsx'))\n",
    "qa_df = qa_df[qa_df['dataset'] == 'train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a158474b-c72d-4298-8afd-ccc86952eead",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:56.839887Z",
     "iopub.status.busy": "2024-07-13T07:17:56.839617Z",
     "iopub.status.idle": "2024-07-13T07:17:56.842948Z",
     "shell.execute_reply": "2024-07-13T07:17:56.842630Z",
     "shell.execute_reply.started": "2024-07-13T07:17:56.839873Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "290"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(qa_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3aaf76f5-036a-49e9-972d-aac1af966a99",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:57.176784Z",
     "iopub.status.busy": "2024-07-13T07:17:57.176583Z",
     "iopub.status.idle": "2024-07-13T07:17:57.184647Z",
     "shell.execute_reply": "2024-07-13T07:17:57.184308Z",
     "shell.execute_reply.started": "2024-07-13T07:17:57.176770Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uuid</th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>context</th>\n",
       "      <th>doc</th>\n",
       "      <th>qa_type</th>\n",
       "      <th>score</th>\n",
       "      <th>score_reason</th>\n",
       "      <th>dataset</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e73a0c9d-d42b-4350-a4c3-b38bf67c68a5</td>\n",
       "      <td>美元指数在2023年的走势如何？</td>\n",
       "      <td>美元指数高位震荡后走弱。</td>\n",
       "      <td>美元指数高位震荡后走弱</td>\n",
       "      <td>研究院\\n全球经济金融展望报告\\n要点2024年年报（总第57期） 报告日期：2023年12...</td>\n",
       "      <td>detailed</td>\n",
       "      <td>5</td>\n",
       "      <td>问题询问的是具体事件的预测，是实时性的，而答案直接给出了对美元指数走势的判断，不是简单的文本描述。</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>41d95288-441d-4c02-948a-6a3f0f4ef3ba</td>\n",
       "      <td>2023年全球货物贸易量指数和价格指数有何变动趋势？</td>\n",
       "      <td>下行</td>\n",
       "      <td>全球货物贸易量指数和价格指数下行，主要经济体出口贸易同比增速下降。</td>\n",
       "      <td>全球经济金融展望报告\\n中国银行研究院 1 2024年\\n全球经济复苏疲软，货币政策取向分化...</td>\n",
       "      <td>detailed</td>\n",
       "      <td>5</td>\n",
       "      <td>问题询问的是未来的趋势，需要分析或预测，不是简单的事实复述。答案直接给出了预测结果，符合要求。</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>41d95288-441d-4c02-948a-6a3f0f4ef3ba</td>\n",
       "      <td>2023年欧美央行的货币政策趋势是什么？</td>\n",
       "      <td>延续收紧态势，但步伐整体放缓</td>\n",
       "      <td>欧美央行货币政策延续收紧态势，但步伐整体放缓。</td>\n",
       "      <td>全球经济金融展望报告\\n中国银行研究院 1 2024年\\n全球经济复苏疲软，货币政策取向分化...</td>\n",
       "      <td>detailed</td>\n",
       "      <td>5</td>\n",
       "      <td>问题询问的是实时的经济政策趋势，具有明确的针对性和价值。答案直接给出了具体的方向和特点，不依...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    uuid                    question  \\\n",
       "4   e73a0c9d-d42b-4350-a4c3-b38bf67c68a5            美元指数在2023年的走势如何？   \n",
       "11  41d95288-441d-4c02-948a-6a3f0f4ef3ba  2023年全球货物贸易量指数和价格指数有何变动趋势？   \n",
       "12  41d95288-441d-4c02-948a-6a3f0f4ef3ba        2023年欧美央行的货币政策趋势是什么？   \n",
       "\n",
       "            answer                            context  \\\n",
       "4     美元指数高位震荡后走弱。                        美元指数高位震荡后走弱   \n",
       "11              下行  全球货物贸易量指数和价格指数下行，主要经济体出口贸易同比增速下降。   \n",
       "12  延续收紧态势，但步伐整体放缓            欧美央行货币政策延续收紧态势，但步伐整体放缓。   \n",
       "\n",
       "                                                  doc   qa_type  score  \\\n",
       "4   研究院\\n全球经济金融展望报告\\n要点2024年年报（总第57期） 报告日期：2023年12...  detailed      5   \n",
       "11  全球经济金融展望报告\\n中国银行研究院 1 2024年\\n全球经济复苏疲软，货币政策取向分化...  detailed      5   \n",
       "12  全球经济金融展望报告\\n中国银行研究院 1 2024年\\n全球经济复苏疲软，货币政策取向分化...  detailed      5   \n",
       "\n",
       "                                         score_reason dataset  \n",
       "4   问题询问的是具体事件的预测，是实时性的，而答案直接给出了对美元指数走势的判断，不是简单的文本描述。   train  \n",
       "11    问题询问的是未来的趋势，需要分析或预测，不是简单的事实复述。答案直接给出了预测结果，符合要求。   train  \n",
       "12  问题询问的是实时的经济政策趋势，具有明确的针对性和价值。答案直接给出了具体的方向和特点，不依...   train  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2bdddb35-1f2f-4452-9e9f-3a1f58b8f944",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:57.537829Z",
     "iopub.status.busy": "2024-07-13T07:17:57.537617Z",
     "iopub.status.idle": "2024-07-13T07:17:57.541173Z",
     "shell.execute_reply": "2024-07-13T07:17:57.540787Z",
     "shell.execute_reply.started": "2024-07-13T07:17:57.537816Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "289"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_df['question'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "09f653b2-ddaa-46c6-9279-0b5311fd5bfc",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:59.268108Z",
     "iopub.status.busy": "2024-07-13T07:17:59.267914Z",
     "iopub.status.idle": "2024-07-13T07:17:59.271316Z",
     "shell.execute_reply": "2024-07-13T07:17:59.270928Z",
     "shell.execute_reply.started": "2024-07-13T07:17:59.268095Z"
    }
   },
   "outputs": [],
   "source": [
    "qa_df = qa_df.drop_duplicates('question')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "677e8d33-65f6-4722-899e-b8d3ad20f636",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:59.607326Z",
     "iopub.status.busy": "2024-07-13T07:17:59.607159Z",
     "iopub.status.idle": "2024-07-13T07:17:59.610434Z",
     "shell.execute_reply": "2024-07-13T07:17:59.609974Z",
     "shell.execute_reply.started": "2024-07-13T07:17:59.607312Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "289"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(qa_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "59ca40a4-e5f2-4914-9efd-3ae4a8656de0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:17:59.899780Z",
     "iopub.status.busy": "2024-07-13T07:17:59.899585Z",
     "iopub.status.idle": "2024-07-13T07:17:59.903758Z",
     "shell.execute_reply": "2024-07-13T07:17:59.903445Z",
     "shell.execute_reply.started": "2024-07-13T07:17:59.899767Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "uuid             0\n",
       "question         0\n",
       "answer           0\n",
       "context         48\n",
       "doc              0\n",
       "qa_type          0\n",
       "score            0\n",
       "score_reason     0\n",
       "dataset          0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qa_df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ebc9b5f0-3555-4f36-aee0-5647f7195850",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:18:00.461187Z",
     "iopub.status.busy": "2024-07-13T07:18:00.461018Z",
     "iopub.status.idle": "2024-07-13T07:18:00.465681Z",
     "shell.execute_reply": "2024-07-13T07:18:00.465317Z",
     "shell.execute_reply.started": "2024-07-13T07:18:00.461174Z"
    }
   },
   "outputs": [],
   "source": [
    "def build_qa_samples(df, neg_batch_size=-1, n_neg_batch=5):\n",
    "    \"\"\"\n",
    "    Build (query, positive, negatives) samples from a QA table.\n",
    "\n",
    "    For every row, the row's answer is the positive and the answers of all rows with a\n",
    "    different question are the candidate negatives. Candidates equal to the positive are\n",
    "    removed before batching, so every batch except possibly the last one is full.\n",
    "    Rows with a missing question or answer are skipped, and a question without any\n",
    "    remaining negative produces no sample.\n",
    "\n",
    "    :param df: DataFrame with at least the columns `question` and `answer`\n",
    "    :param neg_batch_size: number of negatives per sample. A value <= 0 (default -1) pairs\n",
    "        all negatives with the positive in one sample; a positive value splits the negatives\n",
    "        into consecutive batches, so the same query appears in several samples\n",
    "    :param n_neg_batch: maximum number of batches (samples) emitted per question\n",
    "    :return: list of dicts with the keys `query`, `pos` (one-element list) and `neg` (list)\n",
    "    \"\"\"\n",
    "    from tqdm.auto import tqdm\n",
    "    import math\n",
    "\n",
    "    # A row without a question or an answer can serve neither as a positive nor as a negative.\n",
    "    df = df.dropna(subset=['question', 'answer'])\n",
    "    questions = df['question'].tolist()\n",
    "    answers = df['answer'].tolist()\n",
    "\n",
    "    data = []\n",
    "    for question, answer in tqdm(zip(questions, answers), total=len(questions)):\n",
    "        # Negatives: answers that belong to other questions and differ from the positive.\n",
    "        neg_samples = [\n",
    "            other_answer\n",
    "            for other_question, other_answer in zip(questions, answers)\n",
    "            if other_question != question and other_answer != answer\n",
    "        ]\n",
    "        if not neg_samples:\n",
    "            continue\n",
    "        # A non-positive batch size means one batch holding every negative.\n",
    "        batch_size = neg_batch_size if neg_batch_size > 0 else len(neg_samples)\n",
    "        neg_batch_count = min(n_neg_batch, math.ceil(len(neg_samples) / batch_size))\n",
    "        for neg_batch_idx in range(neg_batch_count):\n",
    "            start = neg_batch_idx * batch_size\n",
    "            data.append({\n",
    "                'query': question,\n",
    "                'pos': [answer],\n",
    "                'neg': neg_samples[start:start + batch_size]\n",
    "            })\n",
    "    return data\n",
    "\n",
    "def write_samples(samples, save_filename):\n",
    "    \"\"\"\n",
    "    Write samples to `save_filename` in JSON Lines format (one JSON object per line).\n",
    "\n",
    "    :param samples: iterable of JSON-serialisable objects\n",
    "    :param save_filename: path of the output file; an existing file is overwritten\n",
    "    \"\"\"\n",
    "    import json\n",
    "\n",
    "    # ensure_ascii=False writes non-ASCII text verbatim, so pin the file encoding to UTF-8\n",
    "    # instead of relying on the platform default.\n",
    "    with open(save_filename, 'w', encoding='utf-8') as f:\n",
    "        for sample in samples:\n",
    "            f.write(json.dumps(sample, ensure_ascii=False))\n",
    "            f.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "afd9ef1d-7321-468c-8b44-a4c5b509e195",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:18:02.135815Z",
     "iopub.status.busy": "2024-07-13T07:18:02.135646Z",
     "iopub.status.idle": "2024-07-13T07:18:02.138940Z",
     "shell.execute_reply": "2024-07-13T07:18:02.138473Z",
     "shell.execute_reply.started": "2024-07-13T07:18:02.135801Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "289"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(qa_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "b845be72-32e5-4610-a394-acfa4ee2ee70",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:18:53.287644Z",
     "iopub.status.busy": "2024-07-13T07:18:53.287466Z",
     "iopub.status.idle": "2024-07-13T07:18:53.373966Z",
     "shell.execute_reply": "2024-07-13T07:18:53.373566Z",
     "shell.execute_reply.started": "2024-07-13T07:18:53.287631Z"
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d6d48ba964554993a7836ceb3a6cb6bc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/241 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Keep only the 'detailed' QA pairs and use each pair's context as its answer (the positive text).\n",
    "# .assign() returns a new frame, so no column is written into a filtered slice\n",
    "# (avoids pandas' SettingWithCopyWarning).\n",
    "qa_df = qa_df[qa_df['qa_type'] == 'detailed'].assign(answer=lambda frame: frame['context'])\n",
    "\n",
    "qd_samples = build_qa_samples(qa_df, neg_batch_size=16, n_neg_batch=32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "095ab72c-23f3-41f3-8368-68634480e55e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:18:54.939134Z",
     "iopub.status.busy": "2024-07-13T07:18:54.938932Z",
     "iopub.status.idle": "2024-07-13T07:18:54.942451Z",
     "shell.execute_reply": "2024-07-13T07:18:54.941902Z",
     "shell.execute_reply.started": "2024-07-13T07:18:54.939120Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3615"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(qd_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "5061fd80-c76c-4d11-b71e-221b61eb169d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:18:56.791151Z",
     "iopub.status.busy": "2024-07-13T07:18:56.790941Z",
     "iopub.status.idle": "2024-07-13T07:18:56.794655Z",
     "shell.execute_reply": "2024-07-13T07:18:56.794103Z",
     "shell.execute_reply.started": "2024-07-13T07:18:56.791138Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'query': '美元指数在2023年的走势如何？',\n",
       " 'pos': ['美元指数高位震荡后走弱'],\n",
       " 'neg': ['全球货物贸易量指数和价格指数下行，主要经济体出口贸易同比增速下降。',\n",
       "  '欧美央行货币政策延续收紧态势，但步伐整体放缓。',\n",
       "  '高利率环境抑制债券融资需求，债券违约风险持续上升，美国政府债务可持续性问题引发市场关注。',\n",
       "  '展望2024年，预计全球经济复苏将依旧疲软，主要经济体增长态势和货币政策将进一步分化。',\n",
       "  '发达经济体增速明显放缓，预计2023年增速较2022年下降1个百分点。',\n",
       "  '新兴经济体增速与2022年大致持平，预计2023年增速比2022年下降0.1个百分点。',\n",
       "  '美国GDP环比增长折年率为4.9%，比二季度增速高2.8个百分点。',\n",
       "  '其中，主要新兴经济体工业生产指数普遍走高，如俄罗斯、土耳其、南非等，而发达经济体中的美国和韩国回升，英国、德国、意大利下行，日本波动较大，整体趋于平稳。',\n",
       "  '主要新兴经济体工业生产指数普遍走高，如俄罗斯、土耳其、南非等',\n",
       "  '全球融资环境收紧和经济下行压力对工业生产前景带来较大影响',\n",
       "  '欧洲各国消费指数整体维持稳定（图4）',\n",
       "  '是上半年免于陷入衰退的主要动力。',\n",
       "  'OECD消费者信心指数从7月开始连续3个月回落',\n",
       "  '美国私人投资在2023年一季度触底后逐渐反弹',\n",
       "  '2023年二季度，欧元区固定资本形成总额环比增长0.1%，比一季度增速下降0.3个百分点',\n",
       "  '房地产对GDP环比增长拉动率转为负值']}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qd_samples[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "db4fe418-a520-466e-ab37-8a38bf9719e1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-07-13T07:19:00.294057Z",
     "iopub.status.busy": "2024-07-13T07:19:00.293852Z",
     "iopub.status.idle": "2024-07-13T07:19:01.227561Z",
     "shell.execute_reply": "2024-07-13T07:19:01.227060Z",
     "shell.execute_reply.started": "2024-07-13T07:19:00.294044Z"
    }
   },
   "outputs": [],
   "source": [
    "write_samples(qd_samples, os.path.join(output_dir, 'emb_samples_qd_v2.jsonl'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39f716f3-ee5c-4d0f-a522-5d32e6311160",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
